Here we present an example analysis of 10 million PBMCs treated with cytokines and processed through Parse Biosciences Evercode workflow. We use the python package Scanpy. We will demonstrate steps starting from loading in the data, to preprocessing, and finally to clustering.¶
In [1]:
import os
import sys
import scipy
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import scipy.io as sio
import scanpy.external as sce
import matplotlib.pyplot as plt
import time
# Global Scanpy/session configuration for this tutorial.
# NOTE(review): verbosity=1 presumably limits logging to errors/warnings —
# confirm against the scanpy.settings documentation.
sc.settings.verbosity = 1
# Directory where intermediate AnnData objects are read from / written to.
obj_save_path = '/10M_dataset/analysis/tutorial/'
# Adjust Scanpy figure defaults
sc.settings.set_figure_params(dpi=100, fontsize=10, dpi_save=400,
                              facecolor = 'white', figsize=(6,6), format='png')
Function to get the standard deviation of each column of a sparse CSR matrix, computed over rows in blocks. This will help with replacing the standard Scanpy scale and PCA functions in order to use less memory.¶
In [2]:
def sparse_std(X, blocksize=100):
    """
    Return the std deviation of the columns of a sparse matrix.

    inputs
    X: sparse matrix
    blocksize: number of columns to densify at a time. A larger
        blocksize increases peak memory usage, as this function converts
        each block to a dense matrix and uses the numpy .std() function.
    outputs
    X_std: numpy vector with std deviation of each column of X
    """
    n, m = X.shape
    # Bug fix: the original shadowed the argument with a hard-coded
    # `blocksize = 100`, so the parameter had no effect. Also guard
    # against non-positive values.
    blocksize = max(1, int(blocksize))
    k = m // blocksize
    X_std = []
    # Densify one column block at a time so peak memory stays bounded
    # by n * blocksize instead of n * m.
    for i in range(k):
        X_std += list(X[:, blocksize * i : blocksize * (i + 1)].todense().std(0).A1)
    # Remaining tail columns (empty slice if m is a multiple of blocksize).
    X_std += list(X[:, k * blocksize :].todense().std(0).A1)
    X_std = np.array(X_std)
    return X_std
IRLB functions to replace normal scanpy scale and pca steps in order to reduce memory usage¶
In [3]:
# irlbpy code
# From: https://github.com/airysen/irlbpy
# Date: 2021-12
# License: Apache License, V 2.0, January 2004
#
# Code unmodified except:
# Added two print (feedback) blocks
# Ran through lint formatting tool 'black' https://github.com/psf/black
#
import numpy as np
import scipy.sparse as sparse
import warnings
from numpy.fft import rfft, irfft
import numpy.linalg as nla
# Matrix-vector product wrapper
# A is a numpy 2d array or matrix, or a scipy matrix or sparse matrix.
# x is a numpy vector only.
# Compute A.dot(x) if TP is False, A.transpose().dot(x) otherwise.
def multA(A, x, TP=False, L=None):
    """Matrix-vector product wrapper.

    A is a numpy 2d array or matrix, or a scipy sparse matrix; x is a
    numpy vector. Computes A.dot(x) when TP is False, and the transposed
    product otherwise. L is accepted only for signature compatibility
    with multS.
    """
    if not sparse.issparse(A):
        # Dense path: plain numpy products.
        return x.dot(A) if TP else A.dot(x)
    # Sparse path: route the product through a sparse row vector so no
    # dense copy of A is ever made; densify only the resulting vector.
    xs = sparse.csr_matrix(x)
    if TP:
        return xs.dot(A).transpose().todense().A[:, 0]
    return A.dot(xs.transpose()).todense().A[:, 0]
def multS(s, v, L, TP=False):
    """Implicit product for a 1-D operator, computed via FFT.

    The vector v is reversed/zero-padded by prepare_v, multiplied with s
    in the frequency domain (circular convolution via rfft/irfft), and
    the valid slice of the result is returned: the first L entries for
    the forward product, the tail from index L-1 for the transposed one.
    """
    N = s.shape[0]
    padded = prepare_v(v, N, L, TP=TP)
    conv = irfft(rfft(padded) * rfft(s))
    return conv[L - 1 :] if TP else conv[:L]
def prepare_s(s, L=None):
    """Cyclically shift s for the FFT-based product in multS.

    With K = N - L + 1, the array is rolled by K - 1 positions; L
    defaults to N // 2.
    """
    N = s.shape[0]
    if L is None:
        L = N // 2
    shift = N - L  # equals K - 1 where K = N - L + 1
    return np.roll(s, shift)
def prepare_v(v, N, L, TP=False):
    """Reverse and zero-pad v to length N for the FFT product in multS.

    inputs
    v: input vector (flattened and used in reversed order)
    N: total padded length (length of the operator vector s)
    L: split parameter; K = N - L + 1
    TP: if True, v must have length L and is left-padded with K-1 zeros;
        otherwise v must have length K and is right-padded with L-1 zeros.

    Raises VectorLengthException when v has the wrong length.
    """
    v = v.flatten()[::-1]
    K = N - L + 1
    if TP:
        if v.shape[0] != L:
            raise VectorLengthException(
                "Length of v must be L (if transpose flag is True)"
            )
        # Left-pad so the reversed vector occupies the tail: total length N.
        return np.pad(v, (K - 1, 0), mode="constant", constant_values=0)
    # Bug fix: the expected length here is K = N - L + 1, but the original
    # error message wrongly said "N-K+1" (which equals L).
    if v.shape[0] != K:
        raise VectorLengthException("Length of v must be K (= N - L + 1)")
    # Right-pad with L-1 zeros: total length N.
    return np.pad(v, (0, L - 1), mode="constant", constant_values=0)
def orthog(Y, X):
    """Orthogonalize a vector or matrix Y against the columns of the matrix X.

    This function requires that the column dimension of Y is less than X and
    that Y and X have the same number of rows.
    """
    # Compute the projection coefficients of Y onto X's columns, then
    # subtract the projection itself.
    coeffs = multA(X, Y, TP=True)
    return Y - multA(X, coeffs)
# Simple utility function used to check linear dependencies during computation:
def invcheck(x):
    """Return 1/x, or 0 with a warning when x is numerically negligible.

    Used to detect linear dependence (near-zero norms) during the
    Lanczos process without dividing by ~zero.
    """
    threshold = 2 * np.finfo(float).eps
    if x <= threshold:
        warnings.warn("Ill-conditioning encountered, result accuracy may be poor")
        return 0
    return 1 / x
def lanczos(A, nval, tol=0.0001, maxit=50, center=None, scale=None, L=None):
    """Estimate a few of the largest singular values and corresponding singular
    vectors of matrix using the implicitly restarted Lanczos bidiagonalization
    method of Baglama and Reichel, see:

    Augmented Implicitly Restarted Lanczos Bidiagonalization Methods,
    J. Baglama and L. Reichel, SIAM J. Sci. Comput. 2005

    Keyword arguments:
    tol -- An estimation tolerance. Smaller means more accurate estimates.
    maxit -- Maximum number of Lanczos iterations allowed.

    Given an input matrix A of dimension j * k, and an input desired number
    of singular values n, the function returns a tuple X with five entries:

    X[0] A j * nu matrix of estimated left singular vectors.
    X[1] A vector of length nu of estimated singular values.
    X[2] A k * nu matrix of estimated right singular vectors.
    X[3] The number of Lanczos iterations run.
    X[4] The number of matrix-vector products run.

    The algorithm estimates the truncated singular value decomposition:
    A.dot(X[2]) = X[0]*X[1].

    The optional `center`/`scale` vectors apply per-column centering and
    scaling implicitly inside each matrix-vector product, so the matrix
    itself is never densified or modified.
    """
    import sys

    # Feedback block (added to the upstream irlbpy code): report problem
    # size and whether implicit centering/scaling is active.
    print(
        f">> lanczos A={A.shape}, nval={nval}, tol={tol}, maxit={maxit}",
        file=sys.stdout,
    )
    center_story = "None" if center is None else f"{center.shape}"
    scale_story = "None" if scale is None else f"{scale.shape}"
    print(f"++ lanczos center={center_story}, scale={scale_story}", file=sys.stdout)
    sys.stdout.flush()
    mmult = None
    m = None
    n = None
    if A.ndim == 2:
        # 2-D input (dense or sparse): ordinary matrix-vector products.
        mmult = multA
        m = A.shape[0]
        n = A.shape[1]
        if min(m, n) < 2:
            raise MatrixShapeException("The input matrix must be at least 2x2.")
    elif A.ndim == 1:
        # 1-D input: implicit operator whose products go through the
        # FFT-based multS; pad to even length first.
        mmult = multS
        A = np.pad(A, (0, A.shape[0] % 2), mode="edge")
        N = A.shape[0]
        if L is None:
            L = N // 2
        K = N - L + 1
        m = L
        n = K
        A = prepare_s(A, L)
    elif A.ndim > 2:
        raise MatrixShapeException("The input matrix must be 2D array")
    nu = nval
    m_b = min((nu + 20, 3 * nu, n))  # Working dimension size
    mprod = 0  # count of matrix-vector products performed
    it = 0  # count of restart iterations performed
    j = 0
    k = nu
    smax = 1
    # V: right Lanczos vectors; W: left Lanczos vectors; B: projected
    # bidiagonal matrix; F: residual vector.
    V = np.zeros((n, m_b))
    W = np.zeros((m, m_b))
    F = np.zeros((n, 1))
    B = np.zeros((m_b, m_b))
    V[:, 0] = np.random.randn(n)  # Initial vector
    # NB: norm(V) equals norm(V[:, 0]) here because every other column is 0.
    V[:, 0] = V[:, 0] / np.linalg.norm(V)
    while it < maxit:
        # On restart, resume the factorization from column k.
        if it > 0:
            j = k
        VJ = V[:, j]
        # apply scaling
        if scale is not None:
            VJ = VJ / scale
        W[:, j] = mmult(A, VJ, L=L)
        mprod = mprod + 1
        # apply centering
        # R code: W[, j_w] <- W[, j_w] - ds * drop(cross(dv, VJ)) * du
        if center is not None:
            W[:, j] = W[:, j] - np.dot(center, VJ)
        if it > 0:
            # NB W[:,0:j] selects columns 0,1,...,j-1
            W[:, j] = orthog(W[:, j], W[:, 0:j])
        s = np.linalg.norm(W[:, j])
        sinv = invcheck(s)  # 1/s, or 0 (with warning) if s is negligible
        W[:, j] = sinv * W[:, j]
        # Lanczos process
        while j < m_b:
            F = mmult(A, W[:, j], TP=True, L=L)
            mprod = mprod + 1
            # apply scaling
            if scale is not None:
                F = F / scale
            F = F - s * V[:, j]
            # Full reorthogonalization of the new right vector.
            F = orthog(F, V[:, 0 : j + 1])
            fn = np.linalg.norm(F)
            fninv = invcheck(fn)
            F = fninv * F
            if j < m_b - 1:
                V[:, j + 1] = F
                # Fill the bidiagonal entries of B for this step.
                B[j, j] = s
                B[j, j + 1] = fn
                VJp1 = V[:, j + 1]
                # apply scaling
                if scale is not None:
                    VJp1 = VJp1 / scale
                W[:, j + 1] = mmult(A, VJp1, L=L)
                mprod = mprod + 1
                # apply centering
                # R code: W[, jp1_w] <- W[, jp1_w] - ds * drop(cross(dv, VJP1))
                # * du
                if center is not None:
                    W[:, j + 1] = W[:, j + 1] - np.dot(center, VJp1)
                # One step of classical Gram-Schmidt...
                W[:, j + 1] = W[:, j + 1] - fn * W[:, j]
                # ...with full reorthogonalization
                W[:, j + 1] = orthog(W[:, j + 1], W[:, 0 : (j + 1)])
                s = np.linalg.norm(W[:, j + 1])
                sinv = invcheck(s)
                W[:, j + 1] = sinv * W[:, j + 1]
            else:
                # Last column: only the diagonal entry remains.
                B[j, j] = s
            j = j + 1
        # End of Lanczos process
        S = nla.svd(B)
        R = fn * S[0][m_b - 1, :]  # Residuals
        if it == 0:
            smax = S[1][0]  # Largest Ritz value
        else:
            smax = max((S[1][0], smax))
        # Count how many of the wanted values have converged.
        conv = sum(np.abs(R[0:nu]) < tol * smax)
        if conv < nu:  # Not converged yet
            k = max(conv + nu, k)
            k = min(k, m_b - 3)
        else:
            break
        # Update the Ritz vectors
        V[:, 0:k] = V[:, 0:m_b].dot(S[2].transpose()[:, 0:k])
        V[:, k] = F
        B = np.zeros((m_b, m_b))
        # Improve this! There must be better way to assign diagonal...
        for l in range(k):
            B[l, l] = S[1][l]
        B[0:k, k] = R[0:k]
        # Update the left approximate singular vectors
        W[:, 0:k] = W[:, 0:m_b].dot(S[0][:, 0:k])
        it = it + 1
    # Recover the approximate singular vectors of A from the small SVD of B.
    U = W[:, 0:m_b].dot(S[0][:, 0:nu])
    V = V[:, 0:m_b].dot(S[2].transpose()[:, 0:nu])
    # return((U, S[1][0:nu], V, it, mprod))
    print(f"<< lanczos it={it} mprod={mprod}", file=sys.stdout)
    sys.stdout.flush()
    return LanczosResult(
        **{"U": U, "s": S[1][0:nu], "V": V, "steps": it, "nmult": mprod}
    )
class LanczosResult:
    """Plain attribute container for lanczos() output.

    Exposes whatever keyword arguments it was built with as attributes
    (lanczos() supplies U, s, V, steps, nmult).
    """

    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)
class VectorLengthException(Exception):
    """Raised by prepare_v when the input vector has the wrong length."""

    pass
class MatrixShapeException(Exception):
    """Raised by lanczos when the input matrix has an unsupported shape."""

    pass
Read in anndata object containing about 10 million PBMCs that have been treated with various cytokines and processed through the Parse Biosciences Evercode workflow in a single experiment.¶
In [4]:
%%time
adata = sc.read_h5ad(obj_save_path + "Parse_10M_PBMC_cytokines.h5ad")
CPU times: user 25.9 s, sys: 1min 36s, total: 2min 2s Wall time: 4min 40s
In [5]:
adata
Out[5]:
AnnData object with n_obs × n_vars = 9697974 × 40352
obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type'
var: 'n_cells'
Log normalize the counts matrix¶
In [6]:
%%time
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
CPU times: user 32.9 s, sys: 28.5 s, total: 1min 1s Wall time: 1min 1s
In [7]:
adata
Out[7]:
AnnData object with n_obs × n_vars = 9697974 × 40352
obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type'
var: 'n_cells'
uns: 'log1p'
Determine the highly variable genes and then subset the AnnData object on these genes¶
In [8]:
%%time
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.25)
sc.pl.highly_variable_genes(adata)
CPU times: user 3min 24s, sys: 1min 27s, total: 4min 52s Wall time: 2min 32s
In [9]:
%%time
adata = adata[:, adata.var.highly_variable].copy()
CPU times: user 1min 37s, sys: 10.5 s, total: 1min 47s Wall time: 1min 47s
In [10]:
adata
Out[10]:
AnnData object with n_obs × n_vars = 9697974 × 5412
obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type'
var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
uns: 'log1p', 'hvg'
In [11]:
%%time
adata.write_h5ad(obj_save_path + "adata_post_hvg.h5ad")
CPU times: user 4.65 s, sys: 27.5 s, total: 32.1 s Wall time: 32.5 s
Replace scanpy.pp.scale and scanpy.pp.pca with the following code to use less memory¶
In [12]:
%%time
use_hv = True  # PCA is computed on the HVG-subset matrix
# Record PCA parameters the way sc.pp.pca would, so downstream Scanpy
# functions find adata.uns['pca'].
adata.uns['pca'] = {}
adata.uns['pca']['params'] = {
    'zero_center': True,
    'use_highly_variable': use_hv,
}
# Per-gene mean and std, passed to lanczos() so centering/scaling are
# applied implicitly inside each matrix-vector product — the sparse
# matrix is never densified into a scaled copy.
adata.var['mean'] = adata.X.mean(0).A1
adata.var['std'] = sparse_std(adata.X)
# Truncated SVD (50 components) of the implicitly standardized matrix.
S = lanczos(adata.X,50,center=adata.var['mean'],scale=adata.var['std'])
adata.obsm['X_pca'] = (S.U * S.s)  # cell embeddings: U scaled by singular values
adata.varm['PCs'] = S.V  # gene loadings
adata.uns['pca']['variance'] = adata.obsm['X_pca'].var(0)
# NOTE(review): the denominator assumes the standardized matrix has total
# variance ~ (n_vars - 1); confirm this matches sc.pp.pca's variance_ratio.
adata.uns['pca']['variance_ratio'] = adata.uns['pca']['variance'] / (adata.X.shape[1] - 1 )
>> lanczos A=(9697974, 5412), nval=50, tol=0.0001, maxit=50 ++ lanczos center=(5412,), scale=(5412,) << lanczos it=16 mprod=236 CPU times: user 1h 54min 28s, sys: 26min 59s, total: 2h 21min 28s Wall time: 1h 22min 19s
In [13]:
adata.write_h5ad(obj_save_path + 'adata_post_pca.h5ad')
In [14]:
adata
Out[14]:
AnnData object with n_obs × n_vars = 9697974 × 5412
obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type'
var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'log1p', 'hvg', 'pca'
obsm: 'X_pca'
varm: 'PCs'
Continue with neighbors, umap, and leiden¶
In [15]:
%%time
sc.pp.neighbors(adata, n_neighbors=15, n_pcs=30)
/home/ubuntu/miniconda3/envs/spipe3/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
CPU times: user 35min 8s, sys: 1min 29s, total: 36min 37s Wall time: 35min 59s
In [16]:
adata
Out[16]:
AnnData object with n_obs × n_vars = 9697974 × 5412
obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type'
var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'log1p', 'hvg', 'pca', 'neighbors'
obsm: 'X_pca'
varm: 'PCs'
obsp: 'distances', 'connectivities'
In [17]:
%%time
sc.tl.umap(adata)
CPU times: user 4h 38min 12s, sys: 2min 53s, total: 4h 41min 5s Wall time: 2h 18min 48s
In [18]:
adata
Out[18]:
AnnData object with n_obs × n_vars = 9697974 × 5412
obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type'
var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'distances', 'connectivities'
In [19]:
adata.write_h5ad(obj_save_path + 'adata_post_umap.h5ad')
In [22]:
%%time
sc.tl.leiden(adata, resolution=1.0, flavor="igraph", n_iterations=2)
CPU times: user 12min 44s, sys: 1min 38s, total: 14min 23s Wall time: 14min 14s
In [23]:
adata
Out[23]:
AnnData object with n_obs × n_vars = 9697974 × 5412
obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type', 'leiden'
var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap', 'leiden'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'distances', 'connectivities'
In [24]:
%%time
adata.write_h5ad(obj_save_path + 'adata_post_leiden.h5ad')
CPU times: user 5.67 s, sys: 31.6 s, total: 37.2 s Wall time: 37.8 s
Plotting¶
In [25]:
%%time
sc.pl.umap(adata, color='leiden', legend_fontsize=8)
CPU times: user 22.1 s, sys: 1.01 s, total: 23.1 s Wall time: 23.1 s
In [26]:
%%time
sc.pl.umap(adata, color='leiden', legend_fontsize=8, legend_loc = "on data")
CPU times: user 22.6 s, sys: 1.05 s, total: 23.6 s Wall time: 23.6 s
In [27]:
%%time
sc.pl.umap(adata, color='donor', legend_fontsize=8)
CPU times: user 22 s, sys: 985 ms, total: 23 s Wall time: 23 s
If it is desirable to remove the batch effects of having different PBMC donors, one can run harmony integration. Due to technical variation and differences in underlying biology between PBMC donors, integrating with harmony can help users more easily identify which clusters correspond to which cell type.¶
In [2]:
%%time
adata = sc.read(obj_save_path + "adata_post_leiden.h5ad")
adata
CPU times: user 18.8 s, sys: 1min 24s, total: 1min 43s Wall time: 7min 26s
Out[2]:
AnnData object with n_obs × n_vars = 9697974 × 5412
obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type', 'leiden'
var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'leiden', 'log1p', 'neighbors', 'pca', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
In [3]:
%%time
sce.pp.harmony_integrate(adata, 'donor')
2024-10-31 16:46:56,372 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans... 2024-10-31 16:52:35,535 - harmonypy - INFO - sklearn.KMeans initialization complete. 2024-10-31 16:53:01,809 - harmonypy - INFO - Iteration 1 of 10 2024-10-31 17:36:44,967 - harmonypy - INFO - Iteration 2 of 10 2024-10-31 18:33:58,711 - harmonypy - INFO - Iteration 3 of 10 2024-10-31 19:24:56,606 - harmonypy - INFO - Iteration 4 of 10 2024-10-31 20:19:26,464 - harmonypy - INFO - Iteration 5 of 10 2024-10-31 21:05:19,316 - harmonypy - INFO - Iteration 6 of 10 2024-10-31 21:49:58,713 - harmonypy - INFO - Iteration 7 of 10 2024-10-31 22:04:55,678 - harmonypy - INFO - Converged after 7 iterations
CPU times: user 1d 21min 24s, sys: 56min 47s, total: 1d 1h 18min 12s Wall time: 5h 18min 4s
In [4]:
%%time
adata.write_h5ad(obj_save_path + 'adata_post_harmony.h5ad')
CPU times: user 12.8 s, sys: 48.8 s, total: 1min 1s Wall time: 1min 3s
In [ ]:
Continue with neighbors. To use the results of harmony integration, we set use_rep = "X_pca_harmony"¶
In [5]:
%%time
sc.pp.neighbors(adata, n_neighbors=15, n_pcs=30, use_rep="X_pca_harmony")
/home/ubuntu/miniconda3/envs/spipe3/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
CPU times: user 46min 16s, sys: 2min 43s, total: 49min Wall time: 47min 13s
In [6]:
adata
Out[6]:
AnnData object with n_obs × n_vars = 9697974 × 5412
obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type', 'leiden'
var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'leiden', 'log1p', 'neighbors', 'pca', 'umap'
obsm: 'X_pca', 'X_umap', 'X_pca_harmony'
varm: 'PCs'
obsp: 'connectivities', 'distances'
Continue with umap and leiden clustering.¶
In [7]:
%%time
sc.tl.umap(adata)
CPU times: user 4h 33min 44s, sys: 4min 13s, total: 4h 37min 57s Wall time: 2h 25min 24s
In [8]:
adata
Out[8]:
AnnData object with n_obs × n_vars = 9697974 × 5412
obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type', 'leiden'
var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'leiden', 'log1p', 'neighbors', 'pca', 'umap'
obsm: 'X_pca', 'X_umap', 'X_pca_harmony'
varm: 'PCs'
obsp: 'connectivities', 'distances'
In [9]:
%%time
adata.write_h5ad(obj_save_path + 'adata_post_harmony_post_umap.h5ad')
CPU times: user 8.69 s, sys: 1min 19s, total: 1min 28s Wall time: 1min 30s
In [10]:
%%time
sc.tl.leiden(adata, resolution=1.0, flavor="igraph", n_iterations=2)
CPU times: user 14min 2s, sys: 46.1 s, total: 14min 48s Wall time: 14min 38s
In [11]:
adata
Out[11]:
AnnData object with n_obs × n_vars = 9697974 × 5412
obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type', 'leiden'
var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'leiden', 'log1p', 'neighbors', 'pca', 'umap'
obsm: 'X_pca', 'X_umap', 'X_pca_harmony'
varm: 'PCs'
obsp: 'connectivities', 'distances'
In [12]:
%%time
adata.write_h5ad(obj_save_path + 'adata_post_harmony_post_leiden.h5ad')
CPU times: user 10.6 s, sys: 47.5 s, total: 58.1 s Wall time: 59.7 s
In [13]:
%%time
sc.pl.umap(adata, color='leiden', legend_fontsize=8, legend_loc = "on data")
CPU times: user 24.5 s, sys: 1.33 s, total: 25.8 s Wall time: 25.8 s
In [14]:
%%time
sc.pl.umap(adata, color='leiden', legend_fontsize=8)
CPU times: user 24.5 s, sys: 862 ms, total: 25.4 s Wall time: 25.4 s
We now plot by donor to show the results of harmony integration.¶
In [15]:
%%time
sc.pl.umap(adata, color='donor', legend_fontsize=8)
CPU times: user 24.5 s, sys: 833 ms, total: 25.3 s Wall time: 25.3 s
The first and most notable effect of running the integration function is seeing how the samples overlap with one another, which simplifies the process of annotating cell types as it reduces the number of clusters. In this case, we don't see a drastic reduction in the number of clusters, but in scenarios where we see greater technical and biological variation, this can have a much greater impact.¶
We now overlay cell type annotations on the umap plot, which were determined using standard marker genes for PBMCs.¶
In [16]:
%%time
sc.pl.umap(adata, color='cell_type', legend_fontsize=8, legend_loc = "on data")
CPU times: user 24.9 s, sys: 954 ms, total: 25.9 s Wall time: 25.9 s
In [17]:
%%time
sc.pl.umap(adata, color='cell_type', legend_fontsize=8)
CPU times: user 24.7 s, sys: 937 ms, total: 25.7 s Wall time: 25.6 s
We can also plot the cytokines¶
In [4]:
%%time
sc.pl.umap(adata, color='cytokine', legend_fontsize=8)
CPU times: user 25.1 s, sys: 430 ms, total: 25.5 s Wall time: 25.6 s
In [ ]: